In [63]:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib.mlab as mlab
import csv
from scipy.stats import norm
import numpy as np
import scipy.stats as stats
import pandas as pd
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns
In [6]:
data = open('../data/data.csv', 'r').readlines()
fieldnames = ['x', 'y', 'z', 'unmasked', 'synapses']
reader = csv.reader(data)
reader.next()
rows = [[int(col) for col in row] for row in reader]
In [7]:
df = pd.DataFrame(rows)
In [8]:
df.columns = ['x', 'y', 'z', 'unmasked', 'synDen']
In [9]:
df = df[df.unmasked != 0]
df = df[df.synDen != 0]
In [10]:
df['synDen'] = df['synDen']/df['unmasked']
In [11]:
synData = df['synDen'].reshape(-1,1)
print synData
In [12]:
from sklearn.cluster import KMeans
In [21]:
k_range = range(1,26)
k_means_var = [KMeans(n_clusters = k).fit(synData) for k in k_range]
centroids = [X.cluster_centers_ for X in k_means_var]
In [22]:
print k_means_var
In [23]:
from scipy.spatial import distance
def compute_bic(kmeans,X):
"""
Computes the BIC metric for a given clusters
Parameters:
-----------------------------------------
kmeans: List of clustering object from scikit learn
X : multidimension np array of data points
Returns:
-----------------------------------------
BIC value
"""
# assign centers and labels
centers = [kmeans.cluster_centers_]
labels = kmeans.labels_
#number of clusters
m = kmeans.n_clusters
# size of the clusters
n = np.bincount(labels)
#size of data set
N, d = X.shape
#compute variance for all clusters beforehand
cl_var = (1.0 / (N - m) / d) * sum([sum(distance.cdist(X[np.where(labels == i)], [centers[0][i]], 'euclidean')**2) for i in range(m)])
const_term = 0.5 * m * np.log(N) * (d+1)
BIC = np.sum([n[i] * np.log(n[i]) -
n[i] * np.log(N) -
((n[i] * d) / 2) * np.log(2*np.pi*cl_var) -
((n[i] - 1) * d/ 2) for i in range(m)]) - const_term
return(BIC)
In [24]:
BIC = [compute_bic(kmeansi ,synData) for kmeansi in k_means_var]
plt.plot(k_range,BIC,'r-o')
plt.title("cluster vs BIC")
plt.xlabel("# clusters")
plt.ylabel("# BIC")
#Code From: http://www.slideshare.net/SarahGuido/kmeans-clustering-with-scikitlearn
#Code From: http://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans
Out[24]:
In [25]:
#optimum number of clusters is 6 looking at the elbow of the graph
In [13]:
km = KMeans(n_clusters = 7)
km.fit(synData)
labels = km.labels_
df['clusterGroup'] = labels
print df
In [23]:
fig = plt.figure()
ax = fig.gca(projection='3d')
df0 = df[df.clusterGroup == 0]
df1 = df[df.clusterGroup == 1]
df2 = df[df.clusterGroup == 2]
df3 = df[df.clusterGroup == 3]
df4 = df[df.clusterGroup == 4]
df5 = df[df.clusterGroup == 5]
df6 = df[df.clusterGroup == 6]
ax.scatter(df0['x'],df0['y'],df0['z'], c = 'b')
ax.scatter(df1['x'],df1['y'],df1['z'], c = 'g')
ax.scatter(df2['x'],df2['y'],df2['z'], c = 'r')
ax.scatter(df3['x'],df3['y'],df3['z'], c = 'c')
ax.scatter(df4['x'],df4['y'],df4['z'], c = 'm')
ax.scatter(df5['x'],df5['y'],df5['z'], c = 'y')
ax.scatter(df6['x'],df6['y'],df6['z'], c = 'k')
plt.show()
In [30]:
df0 = df[df.clusterGroup == 0]
df1 = df[df.clusterGroup == 1]
df2 = df[df.clusterGroup == 2]
df3 = df[df.clusterGroup == 3]
df4 = df[df.clusterGroup == 4]
df5 = df[df.clusterGroup == 5]
df6 = df[df.clusterGroup == 6]
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(df0['x'],df0['y'],df0['z'], c = 'b')
plt.xlabel('X Coordinate', labelpad = 30)
plt.ylabel('Y Coordinate',labelpad = 30)
plt.gca().set_zlabel('Z Coordinate',labelpad = 30)
plt.title("Cluster Group: 0")
plt.show()
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(df1['x'],df1['y'],df1['z'], c = 'g')
plt.xlabel('X Coordinate',labelpad = 30)
plt.ylabel('Y Coordinate',labelpad = 30)
plt.gca().set_zlabel('Z Coordinate',labelpad = 30)
plt.title("Cluster Group: 1")
plt.show()
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(df2['x'],df2['y'],df2['z'], c = 'r')
plt.xlabel('X Coordinate',labelpad = 300)
plt.ylabel('Y Coordinate',labelpad = 300)
plt.gca().set_zlabel('Z Coordinate',labelpad = 300)
plt.title("Cluster Group: 2")
plt.show()
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(df3['x'],df3['y'],df3['z'], c = 'c')
plt.xlabel('X Coordinate',labelpad = 30)
plt.ylabel('Y Coordinate',labelpad = 30)
plt.gca().set_zlabel('Z Coordinate',labelpad = 30)
plt.title("Cluster Group: 3")
plt.show()
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(df4['x'],df4['y'],df4['z'], c = 'm')
plt.xlabel('X Coordinate',labelpad = 30)
plt.ylabel('Y Coordinate',labelpad = 30)
plt.gca().set_zlabel('Z Coordinate',labelpad = 30)
plt.title("Cluster Group: 4")
plt.show()
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(df5['x'],df5['y'],df5['z'], c = 'y')
plt.xlabel('X Coordinate',labelpad = 30)
plt.ylabel('Y Coordinate',labelpad = 30)
plt.gca().set_zlabel('Z Coordinate',labelpad = 30)
plt.title("Cluster Group: 5")
plt.show()
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(df6['x'],df6['y'],df6['z'], c = 'w')
plt.xlabel('X Coordinate',labelpad = 30)
plt.ylabel('Y Coordinate',labelpad = 30)
plt.gca().set_zlabel('Z Coordinate',labelpad = 30)
plt.title("Cluster Group: 6")
plt.show()
In [31]:
data = open('../data/data.csv', 'r').readlines()
fieldnames = ['x', 'y', 'z', 'unmasked', 'synapses']
reader = csv.reader(data)
reader.next()
rows = [[int(col) for col in row] for row in reader]
In [32]:
sorted_x = sorted(list(set([r[0] for r in rows])))
sorted_y = sorted(list(set([r[1] for r in rows])))
sorted_z = sorted(list(set([r[2] for r in rows])))
In [64]:
data = df.as_matrix()
volume = np.zeros((len(sorted_y),len(sorted_x), len(sorted_z)))
for row in data:
volume[sorted_y.index(row[1]), sorted_x.index(row[0]), sorted_z.index(row[2])] = row[-1]+1
#print lenvolume[0]
for ar in volume:
fig = plt.figure()
ax = sns.heatmap(ar, yticklabels=False, xticklabels=False)
plt.show()
In [65]:
print volume[0][0]
print len(volume[1])
print len(volume)
In [33]:
for i in sorted_y:
data = df[df.y == i]
df0 = data[data.clusterGroup == 0]
df1 = data[data.clusterGroup == 1]
df2 = data[data.clusterGroup == 2]
df3 = data[data.clusterGroup == 3]
df4 = data[data.clusterGroup == 4]
df5 = data[data.clusterGroup == 5]
df6 = data[data.clusterGroup == 6]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(df0['x'],df0['z'], c = 'b')
ax.scatter(df1['x'],df1['z'], c = 'g')
ax.scatter(df2['x'],df2['z'], c = 'r')
ax.scatter(df3['x'],df3['z'], c = 'c')
ax.scatter(df4['x'],df4['z'], c = 'm')
ax.scatter(df5['x'],df5['z'], c = 'y')
ax.scatter(df6['x'],df6['z'], c = 'w')
plt.xlabel('X Coordinate', labelpad = 30)
plt.ylabel('Z Coordinate',labelpad = 30)
plt.title("Y Layer: " + str(i))
plt.show()
In [ ]: